#setup----

#dvpt markdown necessary for theming
#remotes::install_github('rstudio/rmarkdown#1706')

##libs----
library("tidyverse")
library("readxl")
library("thematic")
# library('factoextra')
library('FactoMineR')
library('ggrepel')
library('ggiraph')
# library('showtext')

##auto theme----
thematic_rmd(
  bg = '#002B36', fg = '#EEE8D5', accent = 'auto',
  sequential = hcl.colors(n = 42, palette = 'viridis')
)

##operators----
`%notin%` <- Negate(`%in%`)
#data input----

IDS and CD clustering

All data processed in R 4.0.1.

Raw data processing

UV

All UV spectra (including buffers) were gathered in a single dataframe, wherein a cation variable indicates whether any given spectrum was recorded in presence of absence of cation. The nature of the salt (potassium or sodium) used for each sample was extracted from the IDS table file and matched by oligonucleotide name.

Two buffers were recorded on different days for each two salts. For each sample, the matching blank (same salt, same day) was subtracted. The resulting spectra were baseline-subtracted by subtracting the average absorbance value at \(\lambda > 320\) nm.

The data was finally normalized by convertion to molar extinction coefficients \(\varepsilon\) in \(M^{-1}cm^{-1}\), after extraction and matching of the corresponding concentrations C (converted to \(M\) units) from the IDS table file, and assuming \(l = 1\) cm:

\[\varepsilon = \frac{A}{l \times C}\]

#oligo list----
seq.table <- readxl::read_excel("raw/IDS/UV.xlsx",
                                sheet = 'seq')

#UV files----

## UV with cation----
UV.input.cation <- readxl::read_excel("raw/IDS/UV.xlsx",
                                      sheet = 'cation') %>% 
  add_column(cation = 'cation')

## UV without cation----
UV.input.no.cation <- readxl::read_excel("raw/IDS/UV.xlsx",
                                         sheet = 'no cation') %>% 
  add_column(cation = 'no cation')

# processing----
UV <- UV.input.cation %>% 
  pivot_longer(
    cols = 2:(ncol(UV.input.cation)-1),
    values_to = 'abs',
    names_to = 'oligo'
  ) %>% 
  rbind(
    UV.input.no.cation %>% 
      pivot_longer(
        cols = 2:(ncol(UV.input.no.cation)-1),
        values_to = 'abs',
        names_to = 'oligo'
      )
  ) %>% 
  mutate(abs = as.numeric(abs)) %>% 
  left_join(
    seq.table %>% 
      select(oligo, salt, CF, oligo.number),
    by = 'oligo'
  ) %>% 
  group_by(cation, wl) %>% 
  mutate(
    abs.blk = if_else(
      salt == 'K',
      if_else(
        oligo.number < 19,
        abs - abs[oligo == 'buffer.K.1'],
        abs - abs[oligo == 'buffer.K.2']
      ),
      if_else(
        oligo.number < 19,
        abs - abs[oligo == 'buffer.Na.1'],
        abs - abs[oligo == 'buffer.Na.2']
      )
    )
  ) %>% 
  group_by(oligo, cation, salt) %>% 
  mutate(abs.cor = abs.blk - mean(abs.blk[wl > 320]),
         eps = abs.cor/(1*CF/1E6)) %>% 
  ungroup() %>% 
  filter(
    oligo %notin% c('buffer.K.1', 'buffer.K.2', 'buffer.Na.1', 'buffer.Na.2')
  )

IDS

IDS were calculated for each oligonucleotide by subtraction of their processed ‘cation’ data from their ‘no cation’ data.

#IDS----
IDS <- UV %>% 
  group_by(oligo, wl) %>% 
  mutate(delta.eps = eps[cation == 'no cation'] - eps[cation == 'cation']) %>% 
  filter(cation == 'cation') %>% 
  select(oligo, wl, delta.eps, salt)

CD

The CD data (with buffers) was also collated into a single data table, and blank subtracted and baseline-corrected similarly to the UV data.

The ellipticities \(\theta\) in mdeg were then normalized to molar ellipticities \(\Delta \varepsilon\) in \(M^{-1}cm^{-1}\), after extraction and matching of the corresponding concentrations C (converted to \(M\) units) from the IDS table file, using:

\[\Delta \varepsilon = \frac{\theta}{32980 \times l \times C}\]

#file import----
##CD input file reader----

CD_reader <- function(oligo.name){
  input.data <- read_delim(paste0("raw/CD/CD-",oligo.name,".txt"), 
                           "\t", escape_double = FALSE, col_names = FALSE, 
                           trim_ws = TRUE) %>% 
    select(c(1,2))  %>% 
    magrittr::set_colnames(c('wl', oligo.name))
}

##initialization----

###read one file to get wavelength column length and content
dummy <- CD_reader(oligo.name = "1XAV")

###initialize loop result data frame
cd.binder <- data.frame(dummy[,1])

####prepare list of files to import
sample.list <- c(seq.table$oligo, "buffer.K.1", "buffer.K.2", "buffer.Na.1", "buffer.Na.2")

##loop----

###import new data in a new column
for (i in sample.list) {
  
  buf <- CD_reader(oligo.name = i)
  
  cd.binder <- cbind(cd.binder, buf[,2])
}

#processing----

cd <- cd.binder %>% 
  pivot_longer(
    cols = 2:(length(sample.list)+1),
    names_to = 'oligo',
    values_to = 'cd'
  ) %>% 
  left_join(
    seq.table %>% 
      select(oligo, salt, CF, oligo.number),
    by = 'oligo'
  ) %>% 
  group_by(wl) %>% 
  mutate(
    cd.blk = if_else(
      salt == 'K',
      if_else(
        oligo.number < 19,
        cd - cd[oligo == 'buffer.K.1'],
        cd - cd[oligo == 'buffer.K.2']
      ),
      if_else(
        oligo.number < 19,
        cd - cd[oligo == 'buffer.Na.1'],
        cd - cd[oligo == 'buffer.Na.2']
      )
    )
  ) %>% 
  group_by(oligo, salt) %>% 
  mutate(cd.cor = cd.blk - mean(cd.blk[wl > 320]),
         delta.eps = cd.cor/(32980*CF/1E6)) %>% 
  ungroup() %>% 
  filter(
    oligo %notin% c('buffer.K.1', 'buffer.K.2', 'buffer.Na.1', 'buffer.Na.2')
  ) %>% 
  select(oligo, wl, delta.eps, salt)

Processed data tables

UV

UV

CD

cd

IDS

IDS

Data plots

UV

Visually, some oligonucleotides seem to have fairly superimposed spectra, which is not boding well for IDSs, i.e. 5YEY, DT2-L2T4, Ran4, and 2LXQ. Otherwise, it looks ok.

#data plot----

## UV----

p.UV <- UV %>% 
  ggplot(
    aes(
      x = wl, y = eps,
      color = cation,
      group = cation,
      linetype = salt
    )
  ) +
  geom_line(
    size = 1,
    show.legend = T
  ) +
  facet_wrap(~oligo,
             ncol = 4) +
  labs(
    x = bquote(lambda~(nm)),
    y = bquote(epsilon~(M^-1*cm^-1))
  ) +
  theme(
    axis.title = element_text(size = 16),
    axis.text = element_text(size = 12),
    strip.text = element_text(size = 13),
    legend.position = 'bottom', 
    legend.text = element_text(size = 13),
    legend.title = element_text(size = 14)
  )

p.UV
UV spectra in presence or absence of cation. Spectra acquired in presence of sodium are dashed.

UV spectra in presence or absence of cation. Spectra acquired in presence of sodium are dashed.

IDS

Most IDS signature look fine, with the infamous negative band at 295 nm. As expected from the UV spectra, 5YEY, DT2-L2T4, Ran4, and 2LXQ are pretty flat. 2M27 is not looking good either, and 201D is not very convincing but has at least some contribution at 295 nm.

##IDS----

p.IDS <- IDS %>% 
  filter(wl <= 320) %>% 
  ggplot(
    aes(
      x = wl, y = delta.eps,
      color = oligo,
      group = oligo,
      linetype = salt
    )
  ) +
  geom_vline(
    xintercept = 295,
    linetype = 'dashed'
  ) +
  geom_vline(
    xintercept = 275,
    linetype = 'dashed'
  ) +
  geom_vline(
    xintercept = 262,
    linetype = 'dashed'
  ) +
  geom_vline(
    xintercept = 245,
    linetype = 'dashed'
  ) +
  geom_hline(
    yintercept = 0
  ) +
  geom_line(
    size = 1,
    show.legend = TRUE
  ) +
  guides(
    color = 'none'
  ) +
  facet_wrap(~oligo,
             ncol = 4) +
  labs(
    x = bquote(lambda~(nm)),
    y = bquote(Delta*epsilon~(M^-1*cm^-1))
  ) +
  theme(
    axis.title = element_text(size = 16),
    axis.text = element_text(size = 12),
    strip.text = element_text(size = 13),
    legend.position = 'bottom', 
    legend.text = element_text(size = 13),
    legend.title = element_text(size = 14)
  )

p.IDS
IDS signature of the panel. IDS determined from sodium-containing solutions are dashed. The vertical lines are placed at 245, 262, 275, and 295 nm.

IDS signature of the panel. IDS determined from sodium-containing solutions are dashed. The vertical lines are placed at 245, 262, 275, and 295 nm.

CD

All spectra look to have been correctly acquired and processed. 2xBlock2_T258-A and, to a lower extent, DT2-L2T4 have ‘weird’ signatures. The former has a negative band at 260 nm, and the latter seems to be polymorphic.

##CD----

p.cd <- cd %>% 
  filter(wl <= 320) %>% 
  ggplot(
    aes(
      x = wl, y = delta.eps,
      color = oligo,
      group = oligo,
      linetype = salt
    )
  ) +
  geom_hline(
    yintercept = 0
  ) +
  geom_line(
    size = 1,
    show.legend = TRUE
  ) +
  facet_wrap(~oligo,
             ncol = 4) +
  labs(
    x = bquote(lambda~(nm)),
    y = bquote(Delta*epsilon~(M^-1*cm^-1))
  ) +
  guides(
    color = 'none'
  ) +
  theme(
    axis.title = element_text(size = 16),
    axis.text = element_text(size = 12),
    strip.text = element_text(size = 13),
    legend.position = 'bottom', 
    legend.text = element_text(size = 13),
    legend.title = element_text(size = 14)
  )  

p.cd
CD spectra of the panel. The spectra are dashed when acquired in sodium-containing solutions.

CD spectra of the panel. The spectra are dashed when acquired in sodium-containing solutions.

PCA and clustering

Data preparation

Before starting the PCA, a topology was attributed to each sequence based on my personal interpretation of the CD (this is what will be called human topology after), but it should be replaced by the NMR-solved topology eventually (where available). Note that this interpretation still holds some merits, because weird CD features may not be obvious when looking at a 3D structure.

## human interpretation of CD for reference----

human.topo <- data.frame(
  oligo = seq.table$oligo,
  topo = c('hybrid', 'hybrid', 'hybrid', 'hybrid', 'hybrid', 'antiparallel', 
           'parallel', 'weird', 'parallel', 'parallel', 'parallel', 'parallel',
           'parallel', 'parallel', 'weird', 'antiparallel', 'hybrid', 'antiparallel', 
           'antiparallel', 'antiparallel', 'antiparallel', 'antiparallel', 'antiparallel', 'antiparallel')
)

human.topo %>% 
  magrittr::set_colnames(c('oligonucleotide', 'topology'))

The PCA was performed on the CD data, or IDS data, and will be performed on both simultaneously when issues with the IDS are resolved (see below). To avoid a quantitative bias from one technique over another in the latter case, the number of data points was made identical by i) rescaling the x-axis between the same 220 and 320 nm range, and ii) removing non-integer wavelength from the CD data (the step was set at 0.5 nm vs 1.0 nm for UV vs. CD).

For IDS, the data from 5YEY, DT2-L2T4, Ran4, 2M27 and 2LXQ was excluded. Their IDS is qualitatively not satisfactory, and they end up forming a cluster (of bad data). 201D is also somewhat suspicious.

In any case, the data was switched from long to large format (explanations).

## training set----
training.ids <- IDS %>% 
  filter(oligo %notin% c('5YEY', 'DT2-L2T4', 'Ran4', '2M27', '2LXQ')) %>% 
  filter(wl <= 310,
         wl >= 220) %>% #IMPORTANT: CHANGES RESULTS
  mutate(wl = paste0("ids-", wl)) %>% 
  pivot_wider(
    names_from = wl,
    values_from = delta.eps
  )

# ncol(training.ids)

training.cd <- cd %>% 
  filter(wl <= 310,
         wl >= 220) %>% #IMPORTANT: CHANGES RESULTS
  filter(wl%%1 == 0) %>% #removing non integer wl to have the same number of data points than in IDS
  mutate(wl = paste0("cd-", wl)) %>% 
  pivot_wider(
    names_from = wl,
    values_from = delta.eps
  )

PCA on CD data

Parameters and quality of PCA

The PCA was performed with centering but no scaling (the data is already normalized), using the FactoMineR package (Lê, Josse, and Husson 2008). The coordinates of the first two dimensions were kept for k-means clustering (set manually to four clusters, i.e. one per basic topology and keeping an extra one for weird stuff).

# training <- training.cd

## PCA training----

### PCA calc----
pca.cd <- FactoMineR::PCA(
  X = training.cd[,-1] %>% 
    select(-salt),
  ncp = 2,
  scale.unit = FALSE,
  graph = FALSE
)

####k-means clustering----
#done on all 5 kept dimensions
pca.cd.km <- kmeans(pca.cd$ind$coord, centers = 4, nstart = 42) 

The Scree plot shows the proportion of the variance that can be attributed to a principal component. Here, more than 90% of the variance can be attributed to the first two principal components. The factor maps shows that almost all wavelengths are well described by the first two principal components (the sum of their square cosine is close to 1). The variables more poorly described by the first two principal components are the wavelengths around 220–230 nm and above 300 nm, which are typically not used to infer topologies. A few wavelengths around 250 nm are also not perfectly described, having a significant contribution of the third principal component (not shown).

####scree----
pca.cd.scree <- data.frame(pca.cd$eig) %>% 
  mutate(dim = 1:nrow(data.frame(pca.cd$eig))) %>% 
  # mutate(dim.cum.sum = cumsum(dim)) %>% 
  ggplot(aes(x = dim, y = percentage.of.variance)) +
  geom_bar(
    stat = 'identity',
    aes(fill = -dim),
    show.legend = FALSE
  ) +
  geom_point(
    aes(
      x = dim, 
      y = cumsum(percentage.of.variance)
    ), 
    size = 3
  ) +
  geom_line(
    aes(
      x = dim, 
      y = cumsum(percentage.of.variance)
    ),
    size = 1
  ) +
  theme(
    axis.title = element_text(size = 16),
    axis.text = element_text(size = 12),
    strip.text = element_text(size = 13),
    legend.position = 'bottom', 
    legend.text = element_text(size = 13),
    legend.title = element_text(size = 14)
  ) +
  labs(
    y = 'Variance contribution (%)',
    x = 'Dimension'
  )

pca.cd.scree
Scree plot, showing the contribution of principal components to the total variance (bars), and the corresponding cumulative variance (points and line).

Scree plot, showing the contribution of principal components to the total variance (bars), and the corresponding cumulative variance (points and line).

pca.cd.fac.map <- data.frame(pca.cd$var$cos2) %>% 
  mutate(var = rownames(data.frame(pca.cd$var$cos2))) %>% 
  mutate(var = as.numeric(substr(var, 4,6))) %>%
  pivot_longer(
    cols = 1:2,
    names_to = 'dim',
    values_to = 'cos2'
  ) %>% 
  mutate(
    dim = substr(dim, 5, 5)
  ) %>% 
  ggplot(
    aes(
      x = dim,
      y = var,
      size = cos2
    )
  ) +
  geom_point(color = "#2AA198",
             alpha = 0.5) +
  theme(
    axis.title = element_text(size = 16),
    axis.text = element_text(size = 12),
    strip.text = element_text(size = 13),
    legend.position = 'bottom', 
    legend.text = element_text(size = 13),
    legend.title = element_text(size = 14)
  ) +
  labs(
    y = 'Variable',
    x = 'Dimension'
  ) +
  scale_size_continuous(limits = c(0,1),
                        range = c(0,5),
                        name = bquote(cos^2))


circleFun <- function(center = c(0,0),diameter = 1, npoints = 100){
  r = diameter / 2
  tt <- seq(0,2*pi,length.out = npoints)
  xx <- center[1] + r * cos(tt)
  yy <- center[2] + r * sin(tt)
  return(data.frame(cir.x = xx, cir.y = yy))
}

circle <- circleFun(c(0,0), 2, npoints = 100)


pca.cd.var.cor <- data.frame(pca.cd$var$cor) %>% 
  select(Dim.1, Dim.2) %>% 
  mutate(var = rownames(data.frame(pca.cd$var$cor))) %>% 
  left_join(
    data.frame(pca.cd$var$cos2) %>% 
      mutate(sum.cos2 = Dim.1 + Dim.2) %>% 
      select(sum.cos2) %>% 
      mutate(var = rownames(data.frame(pca.cd$var$cos2))),
    by = 'var'
  ) %>% 
  ggplot(
    aes(
      x = Dim.1,
      y = Dim.2,
      color = sum.cos2
    )
  ) +
  geom_path(
    data = circle, 
    aes(x=cir.x, y=cir.y), 
    inherit.aes = FALSE
  ) +
  geom_segment(
    aes(
      x = 0, y = 0,
      xend = Dim.1, yend = Dim.2
    ),
    arrow = arrow(length = unit(0.1, "inches")
    ),
    show.legend = TRUE
  ) +
  geom_text_repel(
    aes(label = var),
    show.legend = FALSE
  ) +
  scale_x_continuous(limits = c(-1,1)) +
  scale_y_continuous(limits = c(-1,1)) +
  theme(
    axis.title = element_text(size = 16),
    axis.text = element_text(size = 12),
    strip.text = element_text(size = 13),
    legend.position = 'bottom', 
    legend.key.width = unit(0.5, 'in'),
    legend.text = element_text(size = 11),
    legend.title = element_text(size = 14)
  ) +
  labs(
    y = 'Dimension 2',
    x = 'Dimension 1'
  ) +
  scale_color_continuous(
    type = 'viridis',
    limits = c(0,1),
    name = bquote(sum~cos^2)
  )

patchwork::wrap_plots(pca.cd.fac.map + pca.cd.var.cor)
Left: Factor map of the square cosine of the variables for the first two components (larger points show a better representation quality). Right: Circle of correlation for the first two principal components. Variables close to the circle and with larger sum of square cosine are best described by these principal components

Left: Factor map of the square cosine of the variables for the first two components (larger points show a better representation quality). Right: Circle of correlation for the first two principal components. Variables close to the circle and with larger sum of square cosine are best described by these principal components

PCA results

The results are shown as coordinates in the first two dimensions, colored by k-means cluster, and shaped by ‘human topology.’

The clustering follows nicely the human topology parameter, with one exception: 5YEY is antiparallel but its CD tends towards those of hybrid topologies (it doesn’t have a clearly negative band at 260 nm). DT2-L2T4 is clustered with the hybrid G4s, as could be expected, yet it is on the outskirt of the cluster because of its seemingly non-pure signature (large 260 nm contribution). The weird 2xBlock2_T258-A is completely isolated (cluster of one), which is satisfying given its unique CD signature.

The contributions of the variables are shown below. As expected, the band at 290 nm is found in antiparallel G4s, the band at 260 nm in parallel G4s, and the hybrid G4s are in between with contributions at 290 nm and 275 nm. It forms a satisfying continuum of antiparallel to hybrid to parallel topologies from left to right.

####individuals coordinates----
pca.cd.coord <- data.frame(pca.cd$ind$coord) %>% 
  cbind(training.cd) %>% 
  add_column(km = pca.cd.km$cluster) %>% 
  left_join(human.topo,
            by = 'oligo') %>% 
  ggplot(
    aes(
      x = Dim.1,
      y = Dim.2,
      color = factor(km),
      shape = factor(topo)
    )
  ) +
  geom_point(
    size = 3,
    show.legend = FALSE
  ) +
  geom_label_repel(
    aes(label = oligo),
    label.size = 0.5,
    show.legend = FALSE
  ) +
  theme(
    axis.title = element_text(size = 16),
    axis.text = element_text(size = 12),
    strip.text = element_text(size = 13),
    legend.position = 'bottom', 
    legend.text = element_text(size = 13),
    legend.title = element_text(size = 14)
  ) +
  labs(
    y = 'Dimension 2',
    x = 'Dimension 1'
  )

#####variable coordinates----
pca.cd.var.coord <- data.frame(pca.cd$var$coord) %>% 
  select(Dim.1, Dim.2) %>% 
  mutate(var = rownames(data.frame(pca.cd$var$coord))) %>% 
  left_join(
    data.frame(pca.cd$var$cos2) %>% 
      mutate(sum.cos2 = Dim.1 + Dim.2) %>% 
      select(sum.cos2) %>% 
      mutate(var = rownames(data.frame(pca.cd$var$cos2))),
    by = 'var'
  ) %>% 
  ggplot(
    aes(
      x = Dim.1,
      y = Dim.2,
      color = sum.cos2
    ),
    show.legend = FALSE
  ) +
  geom_segment(
    aes(
      x = 0, y = 0, 
      xend = Dim.1, yend = Dim.2
    ),
    arrow = arrow(length = unit(0.1, "inches")
    ),
    show.legend = TRUE
  ) +
  geom_text_repel(
    aes(label = var),
    show.legend = FALSE
  ) +
  theme(
    axis.title = element_text(size = 16),
    axis.text = element_text(size = 12),
    strip.text = element_text(size = 13),
    legend.position = 'right', 
    legend.text = element_text(size = 13),
    legend.title = element_text(size = 14)
  ) +
  labs(
    y = 'Dimension 2',
    x = 'Dimension 1'
  )


pca.cd.coord

pca.cd.var.coord

PCA on IDS data

Parameters and quality of PCA

The PCA was carried out exactly like for CD. The coordinates of the first three dimensions were kept for k-means clustering (set manually to three clusters, i.e. one per basic topology), but the results of this was not used hereafter.

# training <- training.cd

## PCA training----

### PCA calc----
pca.ids <- FactoMineR::PCA(
  X = training.ids[,-1] %>% 
    select(-salt),
  ncp = 3,
  scale.unit = FALSE,
  graph = FALSE
)

####k-means clustering----
#done on all 5 kept dimensions
pca.ids.km <- kmeans(
  pca.ids$ind$coord, 
  algorithm = 'Hartigan-Wong',
  centers = 3, 
  nstart = 42
) 

Contrary to CD, the scree plot for IDS shows a somewhat significant contribution of the third dimension, although around 90% remain attributed to the first two principal components. This contributions of the third dimension is particularly visible at around 225 and 290 nm, which might therefore be important for data analysis. Accordingly, the first two dimension have a poor quality of description at these wavelengths (see the correlation circle).

####scree----
pca.ids.scree <- data.frame(pca.ids$eig) %>% 
  mutate(dim = 1:nrow(data.frame(pca.ids$eig))) %>% 
  # mutate(dim.cum.sum = cumsum(dim)) %>% 
  ggplot(aes(x = dim, y = percentage.of.variance)) +
  geom_bar(
    stat = 'identity',
    aes(fill = -dim),
    show.legend = FALSE
  ) +
  geom_point(
    aes(
      x = dim, 
      y = cumsum(percentage.of.variance)
    ), 
    size = 3
  ) +
  geom_line(
    aes(
      x = dim, 
      y = cumsum(percentage.of.variance)
    ),
    size = 1
  ) +
  theme(
    axis.title = element_text(size = 16),
    axis.text = element_text(size = 12),
    strip.text = element_text(size = 13),
    legend.position = 'bottom', 
    legend.text = element_text(size = 13),
    legend.title = element_text(size = 14)
  ) +
  labs(
    y = 'Variance contribution (%)',
    x = 'Dimension'
  )

pca.ids.scree
Scree plot, showing the contribution of principal components to the total variance (bars), and the corresponding cumulative variance (points and line).

Scree plot, showing the contribution of principal components to the total variance (bars), and the corresponding cumulative variance (points and line).

pca.ids.fac.map <- data.frame(pca.ids$var$cos2) %>% 
  mutate(var = rownames(data.frame(pca.ids$var$cos2))) %>% 
  mutate(var = as.numeric(substr(var, 5,7))) %>%
  pivot_longer(
    cols = 1:3,
    names_to = 'dim',
    values_to = 'cos2'
  ) %>% 
  mutate(
    dim = substr(dim, 5, 5)
  ) %>% 
  ggplot(
    aes(
      x = dim,
      y = var,
      size = cos2
    )
  ) +
  geom_point(color = "#2AA198",
             alpha = 0.5) +
  theme(
    axis.title = element_text(size = 16),
    axis.text = element_text(size = 12),
    strip.text = element_text(size = 13),
    legend.position = 'bottom', 
    legend.text = element_text(size = 13),
    legend.title = element_text(size = 14)
  ) +
  labs(
    y = 'Variable',
    x = 'Dimension'
  ) +
  scale_size_continuous(limits = c(0,1),
                        range = c(0,5),
                        name = bquote(cos^2))


circleFun <- function(center = c(0,0),diameter = 1, npoints = 100){
  r = diameter / 2
  tt <- seq(0,2*pi,length.out = npoints)
  xx <- center[1] + r * cos(tt)
  yy <- center[2] + r * sin(tt)
  return(data.frame(cir.x = xx, cir.y = yy))
}

circle <- circleFun(c(0,0), 2, npoints = 100)


pca.ids.var.cor <- data.frame(pca.ids$var$cor) %>% 
  select(Dim.1, Dim.2) %>% 
  mutate(var = rownames(data.frame(pca.ids$var$cor))) %>% 
  left_join(
    data.frame(pca.ids$var$cos2) %>% 
      mutate(sum.cos2 = Dim.1 + Dim.2) %>% 
      select(sum.cos2) %>% 
      mutate(var = rownames(data.frame(pca.ids$var$cos2))),
    by = 'var'
  ) %>% 
  ggplot(
    aes(
      x = Dim.1,
      y = Dim.2,
      color = sum.cos2
    )
  ) +
  geom_path(
    data = circle, 
    aes(x=cir.x, y=cir.y), 
    inherit.aes = FALSE
  ) +
  geom_segment(
    aes(
      x = 0, y = 0,
      xend = Dim.1, yend = Dim.2
    ),
    arrow = arrow(length = unit(0.1, "inches")
    ),
    show.legend = TRUE
  ) +
  geom_text_repel(
    aes(label = var),
    show.legend = FALSE
  ) +
  scale_x_continuous(limits = c(-1,1)) +
  scale_y_continuous(limits = c(-1,1)) +
  theme(
    axis.title = element_text(size = 16),
    axis.text = element_text(size = 12),
    strip.text = element_text(size = 13),
    legend.position = 'bottom', 
    legend.key.width = unit(0.5, 'in'),
    legend.text = element_text(size = 11),
    legend.title = element_text(size = 14)
  ) +
  labs(
    y = 'Dimension 2',
    x = 'Dimension 1'
  ) +
  scale_color_continuous(
    type = 'viridis',
    limits = c(0,1),
    name = bquote(sum~cos^2)
  )

patchwork::wrap_plots(pca.ids.fac.map + pca.ids.var.cor)
Left: Factor map of the square cosine of the variables for the first three components (larger points show a better representation quality). Right: Circle of correlation for the first two principal components. Variables close to the circle and with larger sum of square cosine are best described by these principal components

Left: Factor map of the square cosine of the variables for the first three components (larger points show a better representation quality). Right: Circle of correlation for the first two principal components. Variables close to the circle and with larger sum of square cosine are best described by these principal components

PCA results

PC 1 and 2

The PCA is less clear than with CD. Because the third principal component was found to be contributing significantly, the PCA coordinates are colored by topology and not by cluster as in the CD results (when doing so, the tridimensional clusters appear inconsistent with the two dimensional plot, because the latter misses a dimension).

Parallel G4s are clustered together, but are ‘contaminated’ by Hivpro1 (antiparallel), Hivpro3, and 6AC7 (hybrids). The latter two have fairly intense and well-resolved bands at 260 nm in CD. There may therefore be stacking contributions that the PCA on IDS picked up better than on CD. No such explanation can be given for Hivpro1, which has a beautifully antiparallel signature in CD. Its IDS does look like those of Hivpro3 and 6AC7, so it’s understandable it ends up here. An issue with data quality cannot be excluded.

All other antiparallel G4s are grouped in the top right quadrant, with a contamination from 186D. Although it is a hybrid G4, it does have a number of guanines in the loop that may provide additional stacks contributing to this result. Its CD has a particular signature, and among hybrid G4s, it was one of the closest to the antiparallel cluster in CD (although its CD does not look antiparallel!).

2GKU and 2JSL, both human telomeric hybrids, are very close (they were in the CD data as well) which is satisfying. Unfortunately, the three other hybrids are elsewhere as discussed above.

2xBlock2_T258-A is far from the other oligos, corroborating the CD results.

####individuals coordinates----
pca.ids.coord <- data.frame(pca.ids$ind$coord) %>% 
  cbind(training.ids) %>% 
  add_column(km = pca.ids.km$cluster) %>% 
  left_join(human.topo %>% 
              filter(oligo %notin% c('5YEY', 'DT2-L2T4', 'Ran4', '2M27', '2LXQ')),
            by = 'oligo') %>% 
  ggplot(
    aes(
      x = Dim.1,
      y = Dim.2,
      color = factor(topo),
      shape = factor(topo)
    )
  ) +
  geom_point(
    size = 3,
    show.legend = FALSE
  ) +
  geom_label_repel(
    aes(label = oligo),
    label.size = 0.5,
    show.legend = FALSE
  ) +
  theme(
    axis.title = element_text(size = 16),
    axis.text = element_text(size = 12),
    strip.text = element_text(size = 13),
    legend.position = 'bottom', 
    legend.text = element_text(size = 13),
    legend.title = element_text(size = 14)
  ) +
  labs(
    y = 'Dimension 2',
    x = 'Dimension 1'
  )

#####variable coordinates----
pca.ids.var.coord <- data.frame(pca.ids$var$coord) %>% 
  select(Dim.1, Dim.2) %>% 
  mutate(var = rownames(data.frame(pca.ids$var$coord))) %>% 
  left_join(
    data.frame(pca.ids$var$cos2) %>% 
      mutate(sum.cos2 = Dim.1 + Dim.2) %>% 
      select(sum.cos2) %>% 
      mutate(var = rownames(data.frame(pca.ids$var$cos2))),
    by = 'var'
  ) %>% 
  ggplot(
    aes(
      x = Dim.1,
      y = Dim.2,
      color = sum.cos2
    ),
    show.legend = FALSE
  ) +
  geom_segment(
    aes(
      x = 0, y = 0, 
      xend = Dim.1, yend = Dim.2
    ),
    arrow = arrow(length = unit(0.1, "inches")
    ),
    show.legend = TRUE
  ) +
  geom_text_repel(
    aes(label = var),
    show.legend = FALSE
  ) +
  theme(
    axis.title = element_text(size = 16),
    axis.text = element_text(size = 12),
    strip.text = element_text(size = 13),
    legend.position = 'right', 
    legend.text = element_text(size = 13),
    legend.title = element_text(size = 14)
  ) +
  labs(
    y = 'Dimension 2',
    x = 'Dimension 1'
  )


pca.ids.coord

pca.ids.var.coord

PC 1 and 3

Below is the result of the same analysis, but plotting dimensions 1 and 3 rather than 1 and 2.

The parallel oligos are not contamined by 6AC7 and Hivpro3 anymore. The three other hybrids are also better grouped together, and the antiparallel are spread around them. Hivpro1 is still puzzling. 2xBlock2 is still alone in a corner.

The correlation at 295 nm is much better than with the dimension 2 (see the sum cos2), which explain the nice grouping of parallel G4s, but it is worse at around 240 nm.

####individuals coordinates----
pca.ids.coord <- data.frame(pca.ids$ind$coord) %>% 
  cbind(training.ids) %>% 
  add_column(km = pca.ids.km$cluster) %>% 
  left_join(human.topo %>% 
              filter(oligo %notin% c('5YEY', 'DT2-L2T4', 'Ran4', '2M27', '2LXQ')),
            by = 'oligo') %>% 
  ggplot(
    aes(
      x = Dim.1,
      y = Dim.3,
      color = factor(topo),
      shape = factor(topo)
    )
  ) +
  geom_point(
    size = 3,
    show.legend = FALSE
  ) +
  geom_label_repel(
    aes(label = oligo),
    label.size = 0.5,
    show.legend = FALSE
  ) +
  theme(
    axis.title = element_text(size = 16),
    axis.text = element_text(size = 12),
    strip.text = element_text(size = 13),
    legend.position = 'bottom', 
    legend.text = element_text(size = 13),
    legend.title = element_text(size = 14)
  ) +
  labs(
    y = 'Dimension 3',
    x = 'Dimension 1'
  )

#####variable coordinates----
pca.ids.var.coord <- data.frame(pca.ids$var$coord) %>% 
  select(Dim.1, Dim.3) %>% 
  mutate(var = rownames(data.frame(pca.ids$var$coord))) %>% 
  left_join(
    data.frame(pca.ids$var$cos2) %>% 
      mutate(sum.cos2 = Dim.1 + Dim.3) %>% 
      select(sum.cos2) %>% 
      mutate(var = rownames(data.frame(pca.ids$var$cos2))),
    by = 'var'
  ) %>% 
  ggplot(
    aes(
      x = Dim.1,
      y = Dim.3,
      color = sum.cos2
    ),
    show.legend = FALSE
  ) +
  geom_segment(
    aes(
      x = 0, y = 0, 
      xend = Dim.1, yend = Dim.3
    ),
    arrow = arrow(length = unit(0.1, "inches")
    ),
    show.legend = TRUE
  ) +
  geom_text_repel(
    aes(label = var),
    show.legend = FALSE
  ) +
  theme(
    axis.title = element_text(size = 16),
    axis.text = element_text(size = 12),
    strip.text = element_text(size = 13),
    legend.position = 'right', 
    legend.text = element_text(size = 13),
    legend.title = element_text(size = 14)
  ) +
  labs(
    y = 'Dimension 3',
    x = 'Dimension 1'
  )


pca.ids.coord

pca.ids.var.coord

PC 1, 2 and 3

In an attemps to get an optimal separation of topologies, the coordinates in all three dimensions were plotted.

[This is an interactive plot, feel free to rotate it, zoom in, and check oligo names by hovering over the markers]

The parallel G4s are well clustered together, 2xBlock is clearly an outlier, and the antiparallel G4s are better separated from the hybrid ones. Hivpro1 is also well removed from the parallel cluster on the z (dimension 3) axis. 186D is still within the hybrids.

Conceivably, a tridimensional shape could be drawn for each topology with little overlap with other topologies.

library(plotly)

ply <- data.frame(pca.ids$ind$coord) %>%
  cbind(training.ids) %>%
  left_join(human.topo %>%
              filter(oligo %notin% c('5YEY', 'DT2-L2T4', 'Ran4', '2M27', '2LXQ')),
            by = 'oligo') %>%
  # filter(oligo != '2xBlock2_T258-A') %>%
  select(oligo, Dim.1, Dim.2, Dim.3, topo)


plot_ly(
  x = ply$Dim.1, y = ply$Dim.2, z = ply$Dim.3,
  type = "scatter3d",
  mode = "markers",
  color = ply$topo,
  text = ply$oligo
) %>% 
  layout(plot_bgcolor='rgb(254, 247, 234)') %>% 
  layout(paper_bgcolor='transparent') %>% 
  layout(font = list(color = '#FFFFFF')) 
#  
# plot_ly(
#   x = ply.p$Dim.1, y = ply.p$Dim.2, z = ply.p$Dim.3,
#   type = "mesh3d",
#   color = ply$topo,
#   text = ply$oligo
# )
# 
# ply

Final remarks

CD works really well.

IDS is more complex. Overall, it performs better to discriminate parallel G4s from hybrids and antiparallel than these two from one another. It seems that all may be discriminated by defining cluster volumes rather than areas.

A number of seemingly undesired behaviors (e.g. 186D clustering with antiparallel G4s) may in fact be beneficial features from using IDS on top of CD, but this will require to study further the expected stacking from guanines (or other nucleotides) from loops and flanking sequences.

Some data should probably be reacquired (see the comments on IDS signatures) to have the complete set.

The PCA and clustering may need some tweaking after that.

References

Lê, Sébastien, Julie Josse, and François Husson. 2008. “FactoMineR: An R Package for Multivariate Analysis.” Journal of Statistical Software 25 (1). https://doi.org/10.18637/jss.v025.i01.